In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

Kaggle - Bike Example Linear Regression

Training and Test Data: https://www.kaggle.com/c/bike-sharing-demand/data
Evaluation: https://www.kaggle.com/c/bike-sharing-demand/details/evaluation
The Kaggle site requires free registration in order to download the data.

Objective (from Kaggle site):

  1. The dataset consists of two years of hourly rental data.
  2. The training set comprises the first 19 days of each month.
  3. The test set covers the 20th day through the end of each month.
  4. The model should predict the total count of bikes rented during each hour covered by the test set.

This problem is also discussed in an AWS Machine Learning blog post: https://blogs.aws.amazon.com/bigdata/post/Tx2OZ63RJ6Z41A0/Building-a-Numeric-Regression-Model-with-Amazon-Machine-Learning
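
Per the evaluation page linked above, submissions are scored with the Root Mean Squared Logarithmic Error (RMSLE). A minimal sketch of the metric, for reference:

import numpy as np

def rmsle(predicted, actual):
    # RMSLE = sqrt(mean((log(p + 1) - log(a + 1))^2))
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual)) ** 2))

rmsle([16, 40, 32], [16, 40, 32])  # a perfect prediction scores 0.0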

Strategy:
We will create two separate Linear Regression models:
1. Casual Rental Prediction
2. Registered Rental Prediction
We will then combine the two predictions to get the total count for submission to Kaggle. The competition is already over, but we can still submit to check the accuracy of the model.
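
As a rough local illustration of this two-model strategy, the sketch below fits two scikit-learn linear regressions and sums their non-negative predictions. It is only a sanity check, assuming scikit-learn is installed and train.csv is in the data folder described below; it is not the Amazon ML model built later.

import os
import pandas as pd
from sklearn.linear_model import LinearRegression

data_path = r'..\Data\RegressionExamples\BikeTrain'
train = pd.read_csv(os.path.join(data_path, 'train.csv'), parse_dates=['datetime'])
train['hour'] = train.datetime.dt.hour
train['dayofweek'] = train.datetime.dt.dayofweek

features = ['hour', 'dayofweek', 'season', 'holiday', 'workingday',
            'weather', 'temp', 'atemp', 'humidity', 'windspeed']

# One model per rider type, then sum the (non-negative) predictions
casual_model = LinearRegression().fit(train[features], train['casual'])
registered_model = LinearRegression().fit(train[features], train['registered'])
total_pred = (casual_model.predict(train[features]).clip(min=0)
              + registered_model.predict(train[features]).clip(min=0))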

Download the train.csv, test.csv, and sampleSubmission.csv files from the Kaggle link above and store them in C:\AWSMLCourse\Data\RegressionExamples\BikeTrain.

This notebook reads the train.csv file and creates two separate training files: one for registered user rentals and another for casual user rentals.

The following additional features are added: month, day, hour, dayofweek.

The test.csv file is read and the same additional features are added to it.

The following files are generated:

  1. Casual Training - bike_rental_train_casual.csv
  2. Registered Training - bike_rental_train_registered.csv
  3. Test - bike_rental_test.csv

The following schema files are provided for creating the Amazon ML datasources; a sketch of the schema format follows the list below. Copy these schema files to the S3 bucket/folder where you keep the CSV training files.

  1. bike_rental_train_casual.csv.schema
  2. bike_rental_train_registered.csv.schema
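
For reference, Amazon ML schema files are JSON documents declaring the target attribute and a type for each column. The sketch below shows how one could be generated from Python; the key names and attribute types reflect my reading of the Amazon ML schema format and are an assumption, so defer to the supplied .schema files as the authoritative versions.

import json

casual_schema = {
    "version": "1.0",
    "targetAttributeName": "casual",
    "dataFormat": "CSV",
    "dataFileContainsHeader": True,
    "attributes": [
        {"attributeName": "datetime",   "attributeType": "TEXT"},
        {"attributeName": "month",      "attributeType": "CATEGORICAL"},
        {"attributeName": "day",        "attributeType": "CATEGORICAL"},
        {"attributeName": "hour",       "attributeType": "CATEGORICAL"},
        {"attributeName": "dayofweek",  "attributeType": "CATEGORICAL"},
        {"attributeName": "season",     "attributeType": "CATEGORICAL"},
        {"attributeName": "holiday",    "attributeType": "BINARY"},
        {"attributeName": "workingday", "attributeType": "BINARY"},
        {"attributeName": "weather",    "attributeType": "CATEGORICAL"},
        {"attributeName": "temp",       "attributeType": "NUMERIC"},
        {"attributeName": "atemp",      "attributeType": "NUMERIC"},
        {"attributeName": "humidity",   "attributeType": "NUMERIC"},
        {"attributeName": "windspeed",  "attributeType": "NUMERIC"},
        {"attributeName": "casual",     "attributeType": "NUMERIC"},
    ],
}

# Print it; save the output as bike_rental_train_casual.csv.schema next to the CSV
print(json.dumps(casual_schema, indent=2))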

In [2]:
data_path = r'..\Data\RegressionExamples\BikeTrain'

In [3]:
df = pd.read_csv(os.path.join(data_path,'train.csv'), 
                 parse_dates = ['datetime'])

In [4]:
df.head()


Out[4]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1

In [5]:
# Add derived datetime features: month, day, hour, dayofweek
df['month'] = df.datetime.dt.month
df['day'] = df.datetime.dt.day
df['hour'] = df.datetime.dt.hour
df['dayofweek'] = df.datetime.dt.dayofweek  # The day of the week with Monday=0, Sunday=6

In [6]:
# Total casual rentals by day of week (Monday=0): noticeably higher on Saturday (5) and Sunday (6)
df.groupby([df.dayofweek])['casual'].sum()


Out[6]:
dayofweek
0    49524
1    43186
2    52689
3    47941
4    48875
5    78654
6    71266
Name: casual, dtype: int64

In [7]:
# Total registered rentals by day of week: fairly flat, slightly lower on weekends
df.groupby([df.dayofweek])['registered'].sum()


Out[7]:
dayofweek
0    249566
1    241037
2    250465
3    246559
4    246023
5    228480
6    231211
Name: registered, dtype: int64

In [8]:
df.casual.describe()


Out[8]:
count    10886.000000
mean        36.021955
std         49.960477
min          0.000000
25%          4.000000
50%         17.000000
75%         49.000000
max        367.000000
Name: casual, dtype: float64

In [9]:
df.registered.describe()


Out[9]:
count    10886.000000
mean       155.552177
std        151.039033
min          0.000000
25%         36.000000
50%        118.000000
75%        222.000000
max        886.000000
Name: registered, dtype: float64

In [10]:
# Pairwise correlations between the features and the casual/registered/count targets
df.corr()


Out[10]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month day hour dayofweek
season 1.000000 0.029368 -0.008126 0.008879 0.258689 0.264744 0.190610 -0.147121 0.096758 0.164011 0.163439 0.357510 0.388493 -0.006546 -0.013622
holiday 0.029368 1.000000 -0.250491 -0.007074 0.000295 -0.005215 0.001929 0.008409 0.043799 -0.020956 -0.005393 -0.059126 0.022477 -0.000354 -0.079888
workingday -0.008126 -0.250491 1.000000 0.033772 0.029966 0.024660 -0.010880 0.013373 -0.319111 0.119460 0.011594 0.027378 -0.009560 0.002780 -0.288204
weather 0.008879 -0.007074 0.033772 1.000000 -0.055035 -0.055376 0.406244 0.007261 -0.135918 -0.109340 -0.128655 -0.010926 0.006640 -0.022740 -0.043467
temp 0.258689 0.000295 0.029966 -0.055035 1.000000 0.984948 -0.064949 -0.017852 0.467097 0.318571 0.394454 0.041829 0.151535 0.145430 -0.040066
atemp 0.264744 -0.005215 0.024660 -0.055376 0.984948 1.000000 -0.043536 -0.057473 0.462067 0.314635 0.389784 0.039383 0.153536 0.140343 -0.043954
humidity 0.190610 0.001929 -0.010880 0.406244 -0.064949 -0.043536 1.000000 -0.318607 -0.348187 -0.265458 -0.317371 0.037052 0.094173 -0.278011 -0.006178
windspeed -0.147121 0.008409 0.013373 0.007261 -0.017852 -0.057473 -0.318607 1.000000 0.092276 0.091052 0.101369 -0.021811 -0.044665 0.146631 -0.026118
casual 0.096758 0.043799 -0.319111 -0.135918 0.467097 0.462067 -0.348187 0.092276 1.000000 0.497250 0.690414 0.012798 0.064500 0.302045 0.117431
registered 0.164011 -0.020956 0.119460 -0.109340 0.318571 0.314635 -0.265458 0.091052 0.497250 1.000000 0.970948 0.063207 0.086072 0.380540 -0.031338
count 0.163439 -0.005393 0.011594 -0.128655 0.394454 0.389784 -0.317371 0.101369 0.690414 0.970948 1.000000 0.056232 0.089556 0.400601 0.006258
month 0.357510 -0.059126 0.027378 -0.010926 0.041829 0.039383 0.037052 -0.021811 0.012798 0.063207 0.056232 1.000000 0.004678 -0.001817 -0.000755
day 0.388493 0.022477 -0.009560 0.006640 0.151535 0.153536 0.094173 -0.044665 0.064500 0.086072 0.089556 0.004678 1.000000 -0.002016 -0.010228
hour -0.006546 -0.000354 0.002780 -0.022740 0.145430 0.140343 -0.278011 0.146631 0.302045 0.380540 0.400601 -0.001817 -0.002016 1.000000 -0.001007
dayofweek -0.013622 -0.079888 -0.288204 -0.043467 -0.040066 -0.043954 -0.006178 -0.026118 0.117431 -0.031338 0.006258 -0.000755 -0.010228 -0.001007 1.000000

In [11]:
# Pre-shuffle the data with a fixed seed so the training files are reproducible
# The shuffled data will be used for training
np.random.seed(5)
l = list(range(df.shape[0]))
np.random.shuffle(l)

In [12]:
# Apply the shuffled row order (randomize the dataframe)
df = df.iloc[l]
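
An equivalent pandas idiom for the shuffle (it yields a different, but equally random, row order than the index-list approach above; the variable name is just illustrative):

df_shuffled = df.sample(frac=1, random_state=5)  # shuffle all rows reproducibly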

In [13]:
df.head()


Out[13]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count month day hour dayofweek
3650 2011-03-09 00:00:00 3 0 0 2 26.24 30.305 73 7.0015 22 65 87 3 9 0 2
8909 2012-08-13 14:00:00 3 0 1 1 32.80 34.850 33 7.0015 85 163 248 8 13 14 0
4623 2011-05-11 17:00:00 4 0 0 1 15.58 19.695 40 11.0014 100 234 334 5 11 17 2
8818 2012-09-08 19:00:00 3 0 1 1 32.80 37.880 55 12.9980 90 533 623 9 8 19 5
1640 2011-04-14 06:00:00 2 0 1 1 13.94 17.425 76 7.0015 4 66 70 4 14 6 3

In [14]:
# Create the training files for the two separate models:
#   Model 1: casual (non-registered) users' rental count by hour
#   Model 2: registered users' rental count by hour

# casual users training data
df.to_csv(os.path.join(data_path,'bike_rental_train_casual.csv'), 
          index = False,
          columns=['datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday', 
                   'workingday', 'weather', 'temp','atemp', 'humidity', 'windspeed', 'casual'])

# registered users training data
df.to_csv(os.path.join(data_path,'bike_rental_train_registered.csv'), 
          index = False,
          columns=['datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday', 
                   'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'registered'])

# save all data - for reference
df.to_csv(os.path.join(data_path,'bike_rental_all.csv'), 
          index = False, 
          columns = ['datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday', 
                     'workingday', 'weather', 'temp','atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'])

In [15]:
# Create the test set with the same derived features
# (test set provided by Kaggle)
df_test = pd.read_csv(os.path.join(data_path,'test.csv'), 
                      parse_dates = ['datetime'])
df_test['month'] = df_test.datetime.dt.month
df_test['day'] = df_test.datetime.dt.day
df_test['hour'] = df_test.datetime.dt.hour
df_test['dayofweek'] = df_test.datetime.dt.dayofweek

In [16]:
df_test.to_csv(os.path.join(data_path,'bike_rental_test.csv'), 
               index=False,
               columns=['datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday', 
                        'workingday', 'weather', 'temp','atemp', 'humidity', 'windspeed'])

In [17]:
df_test.shape


Out[17]:
(6493, 13)

In [18]:
df.holiday.value_counts()


Out[18]:
0    10575
1      311
Name: holiday, dtype: int64

In [19]:
df.workingday.value_counts()


Out[19]:
1    7412
0    3474
Name: workingday, dtype: int64

In [20]:
df.weather.value_counts()


Out[20]:
1    7192
2    2834
3     859
4       1
Name: weather, dtype: int64

Training and Evaluation Summary

  1. Casual model: Training RMSE 32.69, Evaluation RMSE 31.38, Baseline RMSE 49.21
  2. Registered model: Training RMSE 92.49, Evaluation RMSE 92.28, Baseline RMSE 149.34
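
For context, Amazon ML's baseline RMSE for regression is the error of a model that always predicts the mean of the target. A quick local sketch of that idea on the training data (the figures above come from Amazon ML's own held-out evaluation split, so a local calculation will only be in the same ballpark):

import numpy as np

def baseline_rmse(y):
    # RMSE of a constant prediction equal to the mean of y
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.mean((y - y.mean()) ** 2))

baseline_rmse(df.casual), baseline_rmse(df.registered)  # roughly the stds seen above (~49.96 and ~151.04)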

In [21]:
# read predicted values for casual and registered users.
# sum it up and prepare for submission
df_casual_predicted = pd.read_csv(os.path.join(data_path,
                                               'output_casual',
                                               'bp-jLEzJYrQqDj-bike_rental_test.csv.gz'))

In [22]:
df_casual_predicted.head()


Out[22]:
tag score
0 2011-01-20 00:00:00 -20.96109
1 2011-01-20 01:00:00 -26.02126
2 2011-01-20 02:00:00 -28.45723
3 2011-01-20 03:00:00 -29.15169
4 2011-01-20 04:00:00 -28.62972

In [23]:
def adjust_score(x):
    # Linear regression can predict negative rentals; clamp negative scores to zero
    if x < 0:
        return 0
    else:
        return x

In [24]:
df_casual_predicted['casual_rental'] = df_casual_predicted.score.map(adjust_score)
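
The same clamping can also be done with the pandas built-in clip, which avoids calling a Python function per row:

df_casual_predicted['casual_rental'] = df_casual_predicted.score.clip(lower=0)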

In [25]:
df_casual_predicted.tail()


Out[25]:
tag score casual_rental
6488 2012-12-31 19:00:00 3.942249 3.942249
6489 2012-12-31 20:00:00 -5.686156 0.000000
6490 2012-12-31 21:00:00 -8.456192 0.000000
6491 2012-12-31 22:00:00 -18.586340 0.000000
6492 2012-12-31 23:00:00 -25.622160 0.000000

In [26]:
df_casual_predicted.casual_rental.hist(bins = 20)


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b16024438>

In [27]:
df_registered_predicted = pd.read_csv(os.path.join(data_path,
                                               'output_registered',
                                               'bp-CnT4idlhEIa-bike_rental_test.csv.gz'))

In [28]:
df_registered_predicted['registered_rental'] = df_registered_predicted.score.map(adjust_score)

In [29]:
df_registered_predicted.head()


Out[29]:
tag score registered_rental
0 2011-01-20 00:00:00 -4.680015 0.0
1 2011-01-20 01:00:00 -24.619430 0.0
2 2011-01-20 02:00:00 -29.868420 0.0
3 2011-01-20 03:00:00 -21.793060 0.0
4 2011-01-20 04:00:00 -21.624700 0.0

In [30]:
df_registered_predicted.registered_rental.describe()


Out[30]:
count    6493.000000
mean      158.140855
std       119.320534
min         0.000000
25%        58.508530
50%       146.273500
75%       233.290400
max       504.923100
Name: registered_rental, dtype: float64

In [31]:
df_registered_predicted.registered_rental.hist(bins = 20)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b164027f0>

In [32]:
# Both prediction files are in the same row order, so the casual predictions can be copied across by position
df_registered_predicted["casual_rental"] = df_casual_predicted.casual_rental

In [33]:
df_registered_predicted['count'] = df_registered_predicted.registered_rental + df_registered_predicted.casual_rental

In [34]:
df_registered_predicted.tail()


Out[34]:
tag score registered_rental casual_rental count
6488 2012-12-31 19:00:00 243.5891 243.5891 3.942249 247.531349
6489 2012-12-31 20:00:00 170.1125 170.1125 0.000000 170.112500
6490 2012-12-31 21:00:00 133.2613 133.2613 0.000000 133.261300
6491 2012-12-31 22:00:00 63.5018 63.5018 0.000000 63.501800
6492 2012-12-31 23:00:00 39.6737 39.6737 0.000000 39.673700

In [35]:
# Rename the columns; Kaggle's submission format expects 'datetime' and 'count'
df_registered_predicted.columns = ['datetime',
                                   'score',
                                   'registered_rental',
                                   'casual_rental',
                                   'count']

In [36]:
df_registered_predicted.tail()


Out[36]:
datetime score registered_rental casual_rental count
6488 2012-12-31 19:00:00 243.5891 243.5891 3.942249 247.531349
6489 2012-12-31 20:00:00 170.1125 170.1125 0.000000 170.112500
6490 2012-12-31 21:00:00 133.2613 133.2613 0.000000 133.261300
6491 2012-12-31 22:00:00 63.5018 63.5018 0.000000 63.501800
6492 2012-12-31 23:00:00 39.6737 39.6737 0.000000 39.673700

In [37]:
# File ready to submit on Kaggle
df_registered_predicted.to_csv(os.path.join(data_path,
                                            'predicted_test_kaggle.csv'),
                               index = False,
                               columns = ['datetime', 'count'])
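
Before uploading, a quick sanity check against Kaggle's sample submission can catch column or row-count mismatches (the sample file name below is assumed to match the one you downloaded from Kaggle):

df_submission = pd.read_csv(os.path.join(data_path, 'predicted_test_kaggle.csv'))
df_sample = pd.read_csv(os.path.join(data_path, 'sampleSubmission.csv'))  # adjust to your downloaded file name

assert list(df_submission.columns) == list(df_sample.columns)  # expect ['datetime', 'count']
assert len(df_submission) == len(df_sample)                    # expect 6493 rows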